package me.xuym.crawler.czjyj;

import com.cbs.java.component.application.Application;
import me.xuym.crawler.Processor;
import me.xuym.crawler.simpleextension.entity.SimpleLink;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts raw HTML text into a list of {@link SimpleLink}s by scanning for table-row
 * anchors of the form
 * {@code <tr><td width="86%" valign="middle"><a href='...' target='_blank' title='...'>}.
 * The anchor's {@code title} becomes the link content and its {@code href} becomes the
 * link URL.
 *
 * <p>Created by broche on 05/08/2017.
 */
public class CZJyjConvertor extends Processor<String, List<SimpleLink>> {
    /** Scheme prepended to protocol-relative links ({@code //host/path}). */
    private static final String SCHEME = "http:";

    /** Origin prepended to root-relative links ({@code /path}). */
    private static final String BASE_URL = SCHEME + "//jyj.changzhou.gov.cn";

    /**
     * Matches one link row; group 1 is the {@code href}, group 2 is the {@code title}.
     * Compiled once because the expression never changes and {@link Pattern} instances
     * are immutable and safe to share between calls.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<tr>\\s*<td width=\"86%\" valign=\"middle\"><a href='([^']*)' target='_blank' title='([^']*)'>");

    public CZJyjConvertor(Application application) {
        super(application);
    }

    /**
     * Extracts every matching link from the given HTML.
     *
     * @param content the HTML text to scan; line feeds are stripped before matching, so
     *                they never appear in the extracted titles or URLs
     * @return the links in document order; empty when nothing matches
     * @throws Exception declared by the overridden method; a {@code null} content results
     *                   in a {@link NullPointerException}
     */
    @Override
    protected List<SimpleLink> onProcess(String content) throws Exception {
        String contentStr = content.replace("\n", "");
        Matcher matcher = LINK_PATTERN.matcher(contentStr);

        List<SimpleLink> results = new ArrayList<>();
        while (matcher.find()) {
            SimpleLink result = new SimpleLink();
            result.setContent(matcher.group(2));
            result.setUrl(toAbsoluteUrl(matcher.group(1)));
            results.add(result);
        }
        return results;
    }

    /**
     * Completes a site-relative href into an absolute URL; any other href is returned as is.
     */
    private static String toAbsoluteUrl(String url) {
        if (url.startsWith("//")) {
            // Protocol-relative link: it already names its host, so only the scheme is
            // missing. Prepending the site origin here would produce a bogus URL.
            return SCHEME + url;
        }
        if (url.startsWith("/")) {
            return BASE_URL + url;
        }
        return url;
    }
}
